library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(geosphere)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following object is masked from 'package:base':
##
## date
# Load the merged trip + weather table.
# stringsAsFactors is set explicitly: the str()/summary() output recorded in
# this file shows text columns as factors, which read.csv() only does by
# default before R 4.0.0. Being explicit keeps the results the same on any
# R version.
masterdata <- read.csv("new_MASTER_01_data.csv", stringsAsFactors = TRUE)
# One-off preparation, kept for reference: the timestamp columns were copied
# from sample_01_data.csv and the result written to the file read above.
# NOTE(review): the X column (1..nrow) seen in the loaded data is presumably
# the row-name column that write.csv() adds by default -- confirm, and pass
# row.names = FALSE if the file is ever regenerated.
#sample_01_data <- read.csv("sample_01_data.csv")
#masterdata$newStartTime <- sample_01_data$starttime
#masterdata$newStopTime <- sample_01_data$stoptime
#write.csv(masterdata,"new_MASTER_01_data.csv")
# Per-column overview of the raw table (value ranges, factor counts, NA counts).
summary(masterdata)
## X tripduration start.station.id
## Min. : 1 Min. : 61.0 519 : 1558
## 1st Qu.: 51380 1st Qu.: 363.0 497 : 1227
## Median :102759 Median : 616.0 3255 : 1200
## Mean :102759 Mean : 992.7 285 : 1145
## 3rd Qu.:154138 3rd Qu.: 1081.0 402 : 1125
## Max. :205517 Max. :2678003.0 435 : 1089
## (Other):198173
## start.station.name start.station.latitude start.station.longitude
## Pershing Square North: 1558 Min. :40.66 Min. :-74.03
## E 17 St & Broadway : 1227 1st Qu.:40.72 1st Qu.:-74.00
## 8 Ave & W 31 St : 1200 Median :40.74 Median :-73.99
## Broadway & E 14 St : 1145 Mean :40.74 Mean :-73.98
## Broadway & E 22 St : 1125 3rd Qu.:40.76 3rd Qu.:-73.97
## W 21 St & 6 Ave : 1089 Max. :40.86 Max. :-73.89
## (Other) :198173
## end.station.id end.station.name end.station.latitude
## 519 : 1604 Pershing Square North: 1604 Min. :40.66
## 497 : 1254 E 17 St & Broadway : 1254 1st Qu.:40.72
## 402 : 1194 Broadway & E 22 St : 1194 Median :40.74
## 3255 : 1169 8 Ave & W 31 St : 1169 Mean :40.74
## 285 : 1157 Broadway & E 14 St : 1157 3rd Qu.:40.76
## 426 : 1120 West St & Chambers St: 1120 Max. :40.86
## (Other):198019 (Other) :198019
## end.station.longitude bikeid usertype birth.year
## Min. :-74.05 Min. :14529 Customer : 28805 Min. :1886
## 1st Qu.:-74.00 1st Qu.:25323 Subscriber:176712 1st Qu.:1969
## Median :-73.99 Median :30947 Median :1983
## Mean :-73.98 Mean :29669 Mean :1980
## 3rd Qu.:-73.97 3rd Qu.:35053 3rd Qu.:1990
## Max. :-73.89 Max. :42046 Max. :2003
##
## gender AWND AWND_ATTRIBUTES PRCP
## Min. :0.000 Min. : 1.120 : 22301 Min. :0.000
## 1st Qu.:1.000 1st Qu.: 2.910 ,,W:183216 1st Qu.:0.000
## Median :1.000 Median : 4.030 Median :0.000
## Mean :1.164 Mean : 4.385 Mean :0.106
## 3rd Qu.:1.000 3rd Qu.: 5.140 3rd Qu.:0.040
## Max. :2.000 Max. :12.750 Max. :1.830
## NA's :22301
## PRCP_ATTRIBUTES SNOW SNOW_ATTRIBUTES SNWD
## ,,W,2400 :186524 Min. :0.000 : 545 Min. :0.00000
## T,,W,2400: 18993 1st Qu.:0.000 ,,W,2400 :201841 1st Qu.:0.00000
## Median :0.000 T,,W,2400: 3131 Median :0.00000
## Mean :0.019 Mean :0.02829
## 3rd Qu.:0.000 3rd Qu.:0.00000
## Max. :4.000 Max. :3.90000
## NA's :545
## SNWD_ATTRIBUTES TAVG TAVG_ATTRIBUTES TMAX
## ,,W,2400 :204127 Mode:logical Mode:logical Min. :14.00
## T,,W,2400: 1390 NA's:205517 NA's:205517 1st Qu.:57.00
## Median :71.00
## Mean :68.17
## 3rd Qu.:81.00
## Max. :95.00
##
## TMAX_ATTRIBUTES TMIN TMIN_ATTRIBUTES WDF2
## ,,W:205517 Min. : 2.00 ,,W:205517 Min. : 10.0
## 1st Qu.:42.00 1st Qu.: 60.0
## Median :56.00 Median :220.0
## Mean :53.63 Mean :182.2
## 3rd Qu.:67.00 3rd Qu.:280.0
## Max. :82.00 Max. :360.0
## NA's :22301
## WDF2_ATTRIBUTES WDF5 WDF5_ATTRIBUTES WSF2
## : 22301 Min. : 10.0 : 22700 Min. : 6.90
## ,,W:183216 1st Qu.: 70.0 ,,W:182817 1st Qu.:10.10
## Median :220.0 Median :12.10
## Mean :183.6 Mean :12.81
## 3rd Qu.:270.0 3rd Qu.:15.00
## Max. :360.0 Max. :25.10
## NA's :22700 NA's :22301
## WSF2_ATTRIBUTES WSF5 WSF5_ATTRIBUTES WT01
## : 22301 Min. :11.00 : 22700 Min. :1
## ,,W:183216 1st Qu.:17.00 ,,W:182817 1st Qu.:1
## Median :19.90 Median :1
## Mean :20.77 Mean :1
## 3rd Qu.:23.00 3rd Qu.:1
## Max. :40.90 Max. :1
## NA's :22700 NA's :123167
## WT01_ATTRIBUTES WT02 WT02_ATTRIBUTES WT03
## :123167 Min. :1 :201690 Min. :1
## ,,W: 82350 1st Qu.:1 ,,W: 3827 1st Qu.:1
## Median :1 Median :1
## Mean :1 Mean :1
## 3rd Qu.:1 3rd Qu.:1
## Max. :1 Max. :1
## NA's :201690 NA's :186419
## WT03_ATTRIBUTES WT06 WT06_ATTRIBUTES WT08
## :186419 Min. :1 :204101 Min. :1
## ,,W: 19098 1st Qu.:1 ,,W: 1416 1st Qu.:1
## Median :1 Median :1
## Mean :1 Mean :1
## 3rd Qu.:1 3rd Qu.:1
## Max. :1 Max. :1
## NA's :204101 NA's :172801
## WT08_ATTRIBUTES newStartTime
## :172801 2019-03-01 17:41:27.7210: 2
## ,,W: 32716 2019-07-31 17:48:23.5580: 2
## 2019-01-01 00:35:03.5980: 1
## 2019-01-01 01:14:01.5150: 1
## 2019-01-01 01:59:10.1080: 1
## 2019-01-01 02:47:03.7040: 1
## (Other) :205509
## newStopTime
## 2019-05-28 09:10:01.3380: 2
## 2019-07-12 08:43:08.3900: 2
## 2019-01-01 00:38:10.6250: 1
## 2019-01-01 01:58:41.1290: 1
## 2019-01-01 02:12:34.9820: 1
## 2019-01-01 02:55:16.4380: 1
## (Other) :205509
# Show the column types exactly as read from the CSV, before any of the
# conversions performed further down.
str(masterdata)
## 'data.frame': 205517 obs. of 48 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ tripduration : int 110 1067 325 552 282 1150 178 777 423 144 ...
## $ start.station.id : Factor w/ 906 levels "116","119","120",..: 253 838 98 855 645 850 250 873 214 16 ...
## $ start.station.name : Factor w/ 908 levels "1 Ave & E 110 St",..: 868 208 409 199 747 717 785 126 155 205 ...
## $ start.station.latitude : num 40.8 40.8 40.8 40.7 40.7 ...
## $ start.station.longitude: num -74 -74 -74 -74 -74 ...
## $ end.station.id : Factor w/ 906 levels "116","119","120",..: 213 586 543 787 628 71 423 421 273 16 ...
## $ end.station.name : Factor w/ 908 levels "1 Ave & E 110 St",..: 863 200 390 193 356 677 643 398 289 207 ...
## $ end.station.latitude : num 40.8 40.7 40.8 40.7 40.7 ...
## $ end.station.longitude : num -74 -74 -74 -74 -74 ...
## $ bikeid : int 38891 38269 14654 15101 32868 30584 32492 30258 36783 36111 ...
## $ usertype : Factor w/ 2 levels "Customer","Subscriber": 2 2 2 2 2 2 2 2 2 1 ...
## $ birth.year : int 1989 1965 1990 1977 1996 1988 1954 1989 1977 1961 ...
## $ gender : int 1 1 1 1 1 1 1 1 1 1 ...
## $ AWND : num 3.36 5.37 2.91 1.79 2.91 2.91 2.68 2.91 7.83 4.47 ...
## $ AWND_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
## $ PRCP : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PRCP_ATTRIBUTES : Factor w/ 2 levels ",,W,2400","T,,W,2400": 2 2 1 2 1 1 1 1 1 1 ...
## $ SNOW : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SNOW_ATTRIBUTES : Factor w/ 3 levels "",",,W,2400",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ SNWD : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SNWD_ATTRIBUTES : Factor w/ 2 levels ",,W,2400","T,,W,2400": 1 1 1 1 1 1 1 1 1 1 ...
## $ TAVG : logi NA NA NA NA NA NA ...
## $ TAVG_ATTRIBUTES : logi NA NA NA NA NA NA ...
## $ TMAX : int 87 39 70 87 85 85 80 88 49 60 ...
## $ TMAX_ATTRIBUTES : Factor w/ 1 level ",,W": 1 1 1 1 1 1 1 1 1 1 ...
## $ TMIN : int 73 32 52 75 72 68 63 75 33 38 ...
## $ TMIN_ATTRIBUTES : Factor w/ 1 level ",,W": 1 1 1 1 1 1 1 1 1 1 ...
## $ WDF2 : int 70 250 40 60 220 290 150 140 10 260 ...
## $ WDF2_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
## $ WDF5 : int 40 220 40 70 220 290 150 140 360 260 ...
## $ WDF5_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
## $ WSF2 : num 8.9 13 8.9 10.1 12.1 8.9 8.9 8.9 16.1 13 ...
## $ WSF2_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
## $ WSF5 : num 17 19.9 13 13 19 15 16.1 15 25.1 23 ...
## $ WSF5_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
## $ WT01 : int 1 NA NA NA NA NA NA NA NA NA ...
## $ WT01_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 1 1 1 1 1 1 1 1 1 ...
## $ WT02 : int NA NA NA NA NA NA NA NA NA NA ...
## $ WT02_ATTRIBUTES : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
## $ WT03 : int NA NA NA NA NA NA NA NA NA NA ...
## $ WT03_ATTRIBUTES : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
## $ WT06 : int NA NA NA NA NA NA NA NA NA NA ...
## $ WT06_ATTRIBUTES : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
## $ WT08 : int NA 1 NA NA NA NA 1 NA NA NA ...
## $ WT08_ATTRIBUTES : Factor w/ 2 levels "",",,W": 1 2 1 1 1 1 2 1 1 1 ...
## $ newStartTime : Factor w/ 205515 levels "2019-01-01 00:35:03.5980",..: 124947 197101 150074 92205 108911 100289 70041 126789 29559 28200 ...
## $ newStopTime : Factor w/ 205515 levels "2019-01-01 00:38:10.6250",..: 124930 197107 150056 92205 108899 100292 70035 126787 29556 28200 ...
#is the date data gone?
#summary(read.csv("sample_01_data.csv"))

# Convert columns to factors as needed.
masterdata$bikeid <- as.factor(masterdata$bikeid)
# gender is coded 0/1/2 in the raw data. Levels are given in alphabetical
# label order (Female, Male, Unknown) so the resulting factor is the same as
# the one the earlier nested ifelse() recode produced. Any code other than
# 0/1/2 now becomes NA instead of being silently labelled "Female".
masterdata$gender <- factor(
  masterdata$gender,
  levels = c(2, 1, 0),
  labels = c("Female", "Male", "Unknown")
)

# Drop the row-index column and the raw timestamp columns. starttime/stoptime
# do not appear in the str() output recorded above; when they are absent these
# two assignments are harmless no-ops.
masterdata$X <- NULL
masterdata$starttime <- NULL
masterdata$stoptime <- NULL

# Time zone used both to parse the timestamps and to derive calendar dates.
# NOTE(review): assumes the timestamps are New York local time (station names
# and coordinates point to NYC) -- confirm against the data source.
trip_tz <- "America/New_York"

# Parse "YYYY-mm-dd HH:MM:SS.ffff" strings. The fractional seconds are not
# matched by %S and are dropped, as before. as.character() guards against the
# columns having been read as factors.
masterdata$newStartTime <- as.POSIXct(
  strptime(as.character(masterdata$newStartTime), "%Y-%m-%d %H:%M:%S",
           tz = trip_tz),
  tz = trip_tz
)
masterdata$newStopTime <- as.POSIXct(
  strptime(as.character(masterdata$newStopTime), "%Y-%m-%d %H:%M:%S",
           tz = trip_tz),
  tz = trip_tz
)

# as.Date() on a POSIXct converts to UTC unless tz is given, which pushed
# evening trips onto the next calendar day (the output recorded further down
# shows a trip ending 2019-12-04 19:06:32 with newStopDate 2019-12-05).
# Passing the parsing time zone keeps the date equal to the local wall-clock
# date. The recorded output below predates this fix.
masterdata$newStartDate <- as.Date(masterdata$newStartTime, tz = trip_tz)
masterdata$newStopDate <- as.Date(masterdata$newStopTime, tz = trip_tz)
#distance
# Great-circle (haversine) distance between each trip's start and end station,
# stored in masterdata$distanceH. distHaversine() takes two-column point
# matrices ordered (longitude, latitude); r is the sphere radius and sets the
# unit of the result (6378137 is the earth radius in metres).
# The matrices are built directly from the coordinate columns, so nothing
# depends on a hard-coded row count.
start_points <- cbind(
  masterdata$start.station.longitude,
  masterdata$start.station.latitude
)
end_points <- cbind(
  masterdata$end.station.longitude,
  masterdata$end.station.latitude
)
masterdata$distanceH <- distHaversine(start_points, end_points, r = 6378137)
# Remove the temporaries (assigning NULL would leave the names bound).
rm(start_points, end_points)
# Re-inspect the column types after the conversions and the added columns.
str(masterdata)
## 'data.frame': 205517 obs. of 50 variables:
## $ tripduration : int 110 1067 325 552 282 1150 178 777 423 144 ...
## $ start.station.id : Factor w/ 906 levels "116","119","120",..: 253 838 98 855 645 850 250 873 214 16 ...
## $ start.station.name : Factor w/ 908 levels "1 Ave & E 110 St",..: 868 208 409 199 747 717 785 126 155 205 ...
## $ start.station.latitude : num 40.8 40.8 40.8 40.7 40.7 ...
## $ start.station.longitude: num -74 -74 -74 -74 -74 ...
## $ end.station.id : Factor w/ 906 levels "116","119","120",..: 213 586 543 787 628 71 423 421 273 16 ...
## $ end.station.name : Factor w/ 908 levels "1 Ave & E 110 St",..: 863 200 390 193 356 677 643 398 289 207 ...
## $ end.station.latitude : num 40.8 40.7 40.8 40.7 40.7 ...
## $ end.station.longitude : num -74 -74 -74 -74 -74 ...
## $ bikeid : Factor w/ 19094 levels "14529","14530",..: 16053 15465 97 422 11882 9967 11539 9673 14577 14187 ...
## $ usertype : Factor w/ 2 levels "Customer","Subscriber": 2 2 2 2 2 2 2 2 2 1 ...
## $ birth.year : int 1989 1965 1990 1977 1996 1988 1954 1989 1977 1961 ...
## $ gender : Factor w/ 3 levels "Female","Male",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ AWND : num 3.36 5.37 2.91 1.79 2.91 2.91 2.68 2.91 7.83 4.47 ...
## $ AWND_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
## $ PRCP : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PRCP_ATTRIBUTES : Factor w/ 2 levels ",,W,2400","T,,W,2400": 2 2 1 2 1 1 1 1 1 1 ...
## $ SNOW : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SNOW_ATTRIBUTES : Factor w/ 3 levels "",",,W,2400",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ SNWD : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SNWD_ATTRIBUTES : Factor w/ 2 levels ",,W,2400","T,,W,2400": 1 1 1 1 1 1 1 1 1 1 ...
## $ TAVG : logi NA NA NA NA NA NA ...
## $ TAVG_ATTRIBUTES : logi NA NA NA NA NA NA ...
## $ TMAX : int 87 39 70 87 85 85 80 88 49 60 ...
## $ TMAX_ATTRIBUTES : Factor w/ 1 level ",,W": 1 1 1 1 1 1 1 1 1 1 ...
## $ TMIN : int 73 32 52 75 72 68 63 75 33 38 ...
## $ TMIN_ATTRIBUTES : Factor w/ 1 level ",,W": 1 1 1 1 1 1 1 1 1 1 ...
## $ WDF2 : int 70 250 40 60 220 290 150 140 10 260 ...
## $ WDF2_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
## $ WDF5 : int 40 220 40 70 220 290 150 140 360 260 ...
## $ WDF5_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
## $ WSF2 : num 8.9 13 8.9 10.1 12.1 8.9 8.9 8.9 16.1 13 ...
## $ WSF2_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
## $ WSF5 : num 17 19.9 13 13 19 15 16.1 15 25.1 23 ...
## $ WSF5_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 2 2 2 2 2 2 2 2 2 ...
## $ WT01 : int 1 NA NA NA NA NA NA NA NA NA ...
## $ WT01_ATTRIBUTES : Factor w/ 2 levels "",",,W": 2 1 1 1 1 1 1 1 1 1 ...
## $ WT02 : int NA NA NA NA NA NA NA NA NA NA ...
## $ WT02_ATTRIBUTES : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
## $ WT03 : int NA NA NA NA NA NA NA NA NA NA ...
## $ WT03_ATTRIBUTES : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
## $ WT06 : int NA NA NA NA NA NA NA NA NA NA ...
## $ WT06_ATTRIBUTES : Factor w/ 2 levels "",",,W": 1 1 1 1 1 1 1 1 1 1 ...
## $ WT08 : int NA 1 NA NA NA NA 1 NA NA NA ...
## $ WT08_ATTRIBUTES : Factor w/ 2 levels "",",,W": 1 2 1 1 1 1 2 1 1 1 ...
## $ newStartTime : POSIXct, format: "2019-08-17 13:10:36" "2019-12-04 18:48:45" ...
## $ newStopTime : POSIXct, format: "2019-08-17 13:12:27" "2019-12-04 19:06:32" ...
## $ newStartDate : Date, format: "2019-08-17" "2019-12-04" ...
## $ newStopDate : Date, format: "2019-08-17" "2019-12-05" ...
## $ distanceH : num 413 2567 922 657 1099 ...
# Per-column overview after the conversions above, now including the parsed
# timestamps, the derived dates and distanceH.
summary(masterdata)
## tripduration start.station.id start.station.name
## Min. : 61.0 519 : 1558 Pershing Square North: 1558
## 1st Qu.: 363.0 497 : 1227 E 17 St & Broadway : 1227
## Median : 616.0 3255 : 1200 8 Ave & W 31 St : 1200
## Mean : 992.7 285 : 1145 Broadway & E 14 St : 1145
## 3rd Qu.: 1081.0 402 : 1125 Broadway & E 22 St : 1125
## Max. :2678003.0 435 : 1089 W 21 St & 6 Ave : 1089
## (Other):198173 (Other) :198173
## start.station.latitude start.station.longitude end.station.id
## Min. :40.66 Min. :-74.03 519 : 1604
## 1st Qu.:40.72 1st Qu.:-74.00 497 : 1254
## Median :40.74 Median :-73.99 402 : 1194
## Mean :40.74 Mean :-73.98 3255 : 1169
## 3rd Qu.:40.76 3rd Qu.:-73.97 285 : 1157
## Max. :40.86 Max. :-73.89 426 : 1120
## (Other):198019
## end.station.name end.station.latitude end.station.longitude
## Pershing Square North: 1604 Min. :40.66 Min. :-74.05
## E 17 St & Broadway : 1254 1st Qu.:40.72 1st Qu.:-74.00
## Broadway & E 22 St : 1194 Median :40.74 Median :-73.99
## 8 Ave & W 31 St : 1169 Mean :40.74 Mean :-73.98
## Broadway & E 14 St : 1157 3rd Qu.:40.76 3rd Qu.:-73.97
## West St & Chambers St: 1120 Max. :40.86 Max. :-73.89
## (Other) :198019
## bikeid usertype birth.year gender
## 35306 : 44 Customer : 28805 Min. :1886 Female : 49419
## 34019 : 41 Subscriber:176712 1st Qu.:1969 Male :140370
## 34958 : 41 Median :1983 Unknown: 15728
## 35029 : 41 Mean :1980
## 35324 : 41 3rd Qu.:1990
## 33885 : 40 Max. :2003
## (Other):205269
## AWND AWND_ATTRIBUTES PRCP PRCP_ATTRIBUTES
## Min. : 1.120 : 22301 Min. :0.000 ,,W,2400 :186524
## 1st Qu.: 2.910 ,,W:183216 1st Qu.:0.000 T,,W,2400: 18993
## Median : 4.030 Median :0.000
## Mean : 4.385 Mean :0.106
## 3rd Qu.: 5.140 3rd Qu.:0.040
## Max. :12.750 Max. :1.830
## NA's :22301
## SNOW SNOW_ATTRIBUTES SNWD SNWD_ATTRIBUTES
## Min. :0.000 : 545 Min. :0.00000 ,,W,2400 :204127
## 1st Qu.:0.000 ,,W,2400 :201841 1st Qu.:0.00000 T,,W,2400: 1390
## Median :0.000 T,,W,2400: 3131 Median :0.00000
## Mean :0.019 Mean :0.02829
## 3rd Qu.:0.000 3rd Qu.:0.00000
## Max. :4.000 Max. :3.90000
## NA's :545
## TAVG TAVG_ATTRIBUTES TMAX TMAX_ATTRIBUTES TMIN
## Mode:logical Mode:logical Min. :14.00 ,,W:205517 Min. : 2.00
## NA's:205517 NA's:205517 1st Qu.:57.00 1st Qu.:42.00
## Median :71.00 Median :56.00
## Mean :68.17 Mean :53.63
## 3rd Qu.:81.00 3rd Qu.:67.00
## Max. :95.00 Max. :82.00
##
## TMIN_ATTRIBUTES WDF2 WDF2_ATTRIBUTES WDF5
## ,,W:205517 Min. : 10.0 : 22301 Min. : 10.0
## 1st Qu.: 60.0 ,,W:183216 1st Qu.: 70.0
## Median :220.0 Median :220.0
## Mean :182.2 Mean :183.6
## 3rd Qu.:280.0 3rd Qu.:270.0
## Max. :360.0 Max. :360.0
## NA's :22301 NA's :22700
## WDF5_ATTRIBUTES WSF2 WSF2_ATTRIBUTES WSF5
## : 22700 Min. : 6.90 : 22301 Min. :11.00
## ,,W:182817 1st Qu.:10.10 ,,W:183216 1st Qu.:17.00
## Median :12.10 Median :19.90
## Mean :12.81 Mean :20.77
## 3rd Qu.:15.00 3rd Qu.:23.00
## Max. :25.10 Max. :40.90
## NA's :22301 NA's :22700
## WSF5_ATTRIBUTES WT01 WT01_ATTRIBUTES WT02
## : 22700 Min. :1 :123167 Min. :1
## ,,W:182817 1st Qu.:1 ,,W: 82350 1st Qu.:1
## Median :1 Median :1
## Mean :1 Mean :1
## 3rd Qu.:1 3rd Qu.:1
## Max. :1 Max. :1
## NA's :123167 NA's :201690
## WT02_ATTRIBUTES WT03 WT03_ATTRIBUTES WT06
## :201690 Min. :1 :186419 Min. :1
## ,,W: 3827 1st Qu.:1 ,,W: 19098 1st Qu.:1
## Median :1 Median :1
## Mean :1 Mean :1
## 3rd Qu.:1 3rd Qu.:1
## Max. :1 Max. :1
## NA's :186419 NA's :204101
## WT06_ATTRIBUTES WT08 WT08_ATTRIBUTES newStartTime
## :204101 Min. :1 :172801 Min. :2019-01-01 00:35:03
## ,,W: 1416 1st Qu.:1 ,,W: 32716 1st Qu.:2019-05-03 06:30:28
## Median :1 Median :2019-07-18 16:48:45
## Mean :1 Mean :2019-07-12 13:31:09
## 3rd Qu.:1 3rd Qu.:2019-09-23 18:04:58
## Max. :1 Max. :2019-12-31 23:33:21
## NA's :172801
## newStopTime newStartDate newStopDate
## Min. :2019-01-01 00:38:10 Min. :2019-01-01 Min. :2019-01-01
## 1st Qu.:2019-05-03 06:49:10 1st Qu.:2019-05-03 1st Qu.:2019-05-03
## Median :2019-07-18 16:59:56 Median :2019-07-18 Median :2019-07-18
## Mean :2019-07-12 13:47:42 Mean :2019-07-12 Mean :2019-07-12
## 3rd Qu.:2019-09-23 18:18:30 3rd Qu.:2019-09-23 3rd Qu.:2019-09-23
## Max. :2020-01-02 09:26:42 Max. :2020-01-01 Max. :2020-01-02
##
## distanceH
## Min. : 0.0
## 1st Qu.: 825.9
## Median : 1375.9
## Mean : 1779.9
## 3rd Qu.: 2305.2
## Max. :13812.2
##
# Plot ideas:
# - average precipitation per month
# - facet wrap by month: y = number of rides, x = average precipitation, colour = gender
# - categorize birth year into age groups; facet wrap or colour by age group
# - rain vs trip length (end time - start time)
# - bar plot of busy/not-busy stations on rainy days
# - try the same plot across a few select dates, chosen at random
# Exploratory plots.
# Apart from the first plot, everything below uses only trips shorter than
# max_duration (same unit as tripduration). The filtered table is built once
# here instead of being recomputed for every plot.
max_duration <- 10000
short_trips <- masterdata[masterdata$tripduration < max_duration, ]
# NOTE(review): geom_violin()/geom_boxplot() below receive a continuous x
# (a date or PRCP) with no group aesthetic, which is likely the source of the
# position_dodge warning recorded below. Consider binning x or setting group
# if per-date / per-rain-level distributions are what is wanted.

#newStartDate vs tripduration by gender, all trips (no duration cut-off)
ggplot(
  data = masterdata,
  aes(x = newStartDate, y = tripduration, colour = gender)
) + geom_point()

#newStartDate vs tripduration by gender
date_duration_gender <- ggplot(
  data = short_trips,
  aes(x = newStartDate, y = tripduration, colour = gender)
)
date_duration_gender + geom_point()

date_duration_gender + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

date_duration_gender + geom_violin()
## Warning: position_dodge requires non-overlapping x intervals

date_duration_gender + geom_boxplot()

#newStartDate vs tripduration by usertype
date_duration_usertype <- ggplot(
  data = short_trips,
  aes(x = newStartDate, y = tripduration, colour = usertype)
)
date_duration_usertype + geom_point()

date_duration_usertype + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

date_duration_usertype + geom_violin()

date_duration_usertype + geom_boxplot()

#prcp vs tripduration by gender
prcp_duration_gender <- ggplot(
  data = short_trips,
  aes(x = PRCP, y = tripduration, colour = gender)
)
prcp_duration_gender + geom_point()

prcp_duration_gender + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

prcp_duration_gender + geom_violin()

prcp_duration_gender + geom_boxplot()

#prcp vs tripduration by usertype
prcp_duration_usertype <- ggplot(
  data = short_trips,
  aes(x = PRCP, y = tripduration, colour = usertype)
)
prcp_duration_usertype + geom_point()

prcp_duration_usertype + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

prcp_duration_usertype + geom_violin()

prcp_duration_usertype + geom_boxplot()

#prcp vs distanceH by gender
prcp_distance_gender <- ggplot(
  data = short_trips,
  aes(x = PRCP, y = distanceH, colour = gender)
)
prcp_distance_gender + geom_point()

prcp_distance_gender + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

prcp_distance_gender + geom_violin()

prcp_distance_gender + geom_boxplot()

#prcp vs distanceH by usertype
prcp_distance_usertype <- ggplot(
  data = short_trips,
  aes(x = PRCP, y = distanceH, colour = usertype)
)
prcp_distance_usertype + geom_point()

prcp_distance_usertype + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

prcp_distance_usertype + geom_violin()

prcp_distance_usertype + geom_boxplot()
